CRISP-DM Framework¶

Image source: ftp://public.dhe.ibm.com/software/analytics/spss/documentation/modeler/18.0/en/ModelerCRISPDM.pdf

image-2.png

Data Understanding¶

https://www.kaggle.com/datasets/airbnb/seattle

https://www.kaggle.com/datasets/airbnb/boston

The following Airbnb activity is included in this dataset:

  1. Listings, including full descriptions and average review score
  2. Reviews, including unique id for each reviewer and detailed comments
  3. Calendar, including listing id and the price and availability for that day

Import Library¶

In [1]:
import pandas as pd
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import sweetviz as sv
from io import StringIO
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

Download Dataset¶

In [ ]:
!kaggle datasets download -d airbnb/seattle
!kaggle datasets download -d airbnb/boston

!unzip seattle.zip
!unzip boston.zip

Reading Dataset¶

In [2]:
def flag_city(df, city):
    """Tag every row of `df` with the city it belongs to.

    The ``city`` column is written on `df` itself (created, or overwritten if
    it already exists), and that same frame is returned so the call can wrap
    a ``pd.read_csv`` expression.

    Args:
        df: DataFrame to tag.
        city: Value stored in the ``city`` column for every row.

    Returns:
        The same DataFrame object that was passed in.
    """
    label_column = "city"
    df[label_column] = city
    return df

def concat(df1, df2):
    """Stack `df2` under `df1`, keeping only `df1`'s columns in `df1`'s order.

    `df2` is expected to contain every column of `df1`; any extra columns it
    has are left out. Row labels of both inputs are kept as they are.

    Args:
        df1: DataFrame whose columns define the output layout.
        df2: DataFrame appended below `df1`.

    Returns:
        A new DataFrame with the rows of `df1` followed by the rows of `df2`.
    """
    shared_columns = df1.columns
    aligned = [frame[shared_columns] for frame in (df1, df2)]
    return pd.concat(aligned)

def to_date(df, columns):
    """Normalise date-like columns of `df` to ``YYYY-MM-DD`` text, in place.

    Each listed column is parsed with ``pd.to_datetime`` and then formatted
    back to a string, so the result holds date strings rather than datetime
    values.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column names to normalise.

    Returns:
        The same DataFrame object that was passed in.
    """
    day_format = '%Y-%m-%d'
    for name in columns:
        parsed = pd.to_datetime(df[name])
        df[name] = parsed.dt.strftime(day_format)
    return df

def price_to_numeric(df, columns):
    """Convert currency-formatted text columns of `df` to numbers, in place.

    ``$`` signs and ``,`` separators are removed before parsing. Columns that
    already hold numeric data skip the text clean-up, so the function can be
    applied again to a frame it has already converted (for example when a
    notebook cell is re-run) without failing on the ``.str`` accessor.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column names to convert.

    Returns:
        The same DataFrame object, with each listed column made numeric.

    Raises:
        ValueError: Propagated from ``pd.to_numeric`` when a cleaned value
            still cannot be parsed as a number.
    """
    for col in columns:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # regex=False: "$" must be matched literally; read as a regular
            # expression it is the end-of-string anchor.
            values = (
                values.str.replace('$', '', regex=False)
                .str.replace(',', '', regex=False)
            )
        df[col] = pd.to_numeric(values)
    return df
In [115]:
# Identifier columns are read as text so they are handled as labels, not numbers.
calendar_dtype = {'listing_id': str}
# The key must be "host_id" (the column queried in later cells); the previous
# "hos_id" matched no column, so host ids were never read as text.
listing_dtype = {"id": str, "host_id": str}
reviews_dtype = {"id": str, 'listing_id': str, 'reviewer_id': str}

# Load the three tables for each city and tag every row with its city.
seattle_calendar = flag_city(pd.read_csv("./seattle/calendar.csv", dtype=calendar_dtype), "Seattle")
seattle_listings = flag_city(pd.read_csv("./seattle/listings.csv", dtype=listing_dtype), "Seattle")
seattle_reviews = flag_city(pd.read_csv("./seattle/reviews.csv", dtype=reviews_dtype), "Seattle")

boston_calendar = flag_city(pd.read_csv("./boston/calendar.csv", dtype=calendar_dtype), "Boston")
boston_listings = flag_city(pd.read_csv("./boston/listings.csv", dtype=listing_dtype), "Boston")
boston_reviews = flag_city(pd.read_csv("./boston/reviews.csv", dtype=reviews_dtype), "Boston")
In [282]:
# Stack the Seattle and Boston tables, then normalise date columns to
# 'YYYY-MM-DD' text and currency columns to numbers.
calendar = pd.concat([seattle_calendar, boston_calendar])
calendar = to_date(calendar, ["date"])
calendar = price_to_numeric(calendar, ["price"])

listings = pd.concat([seattle_listings, boston_listings])
listings = to_date(listings, ["host_since", "first_review", "last_review"])
listings = price_to_numeric(listings, ["price", "weekly_price", "monthly_price", "cleaning_fee"])

# Replace the raw 't'/'f' flag with readable labels. Any other value
# (including a missing one) becomes NaN, which the later `.notnull()` filters
# and group-bys leave out.
listings["host_is_superhost"] = listings["host_is_superhost"].map(
    {'t': 'Superhost', 'f': 'Non-Superhost'}
)

reviews = pd.concat([seattle_reviews, boston_reviews])
reviews = to_date(reviews, ["date"])
In [45]:
# Per-city overview: calendar row count and date span, plus the number of
# distinct hosts and listings. Each city subset is computed once and reused.
cal_seattle = calendar[calendar["city"] == "Seattle"]
lst_seattle = listings[listings["city"] == "Seattle"]

print(f"""
    Seattle Data
    Number of observation: {len(cal_seattle)} records
    First observation: {cal_seattle["date"].min()}
    Last observation: {cal_seattle["date"].max()}
    
    Total Host: {lst_seattle["host_id"].nunique(dropna=False)}
    Total Listing: {lst_seattle["id"].nunique(dropna=False)}

""")


cal_boston = calendar[calendar["city"] == "Boston"]
lst_boston = listings[listings["city"] == "Boston"]

print(f"""
    Boston Data
    Number of observation: {len(cal_boston)} records
    First observation: {cal_boston["date"].min()}
    Last observation: {cal_boston["date"].max()}
    
    Total Host: {lst_boston["host_id"].nunique(dropna=False)}
    Total Listing: {lst_boston["id"].nunique(dropna=False)}
""")
    Seattle Data
    Number of observation: 1393570 records
    Fist observation: 2016-01-04
    Last observation: 2017-01-02
    
    Total Host: 2751
    Total Listing: 3818



    Boston Data
    Number of observation: 1308890 records
    Fist observation: 2016-09-06
    Last observation: 2017-09-05
    
    Total Host: 2181
    Total Listing: 3585

EDA¶

In [12]:
# Profile the combined calendar table with sweetviz; the report is written to
# calendar.html with open_browser enabled.
analyze_report = sv.analyze(source=calendar)
analyze_report.show_html(filepath="calendar.html", open_browser=True)
In [13]:
# Profile the combined listings table with sweetviz; the report is written to
# listings.html with open_browser enabled.
analyze_report = sv.analyze(source=listings)
analyze_report.show_html(filepath="listings.html", open_browser=True)
In [14]:
# Profile the combined reviews table with sweetviz; the report is written to
# reviews.html with open_browser enabled.
analyze_report = sv.analyze(source=reviews)
analyze_report.show_html(filepath="reviews.html", open_browser=True)

How are listed properties distributed in Seattle compared to Boston?¶

In [167]:
# Summary statistics of `price` for the Boston rows of `df`.
# NOTE(review): `df` is not assigned in any cell above this one. It is only
# assigned in later cells, each with a different filter, so the result depends
# on which of them ran last. Confirm the intended filter.
df[df["city"]=="Boston"]["price"].describe()
Out[167]:
count    2748.000000
mean      167.188137
std       113.097098
min        11.000000
25%        85.000000
50%       146.000000
75%       215.000000
max      1300.000000
Name: price, dtype: float64
In [168]:
# Summary statistics of `price` for the Seattle rows of `df`.
# NOTE(review): `df` is not assigned in any cell above this one. It is only
# assigned in later cells, each with a different filter, so the result depends
# on which of them ran last. Confirm the intended filter.
df[df["city"]=="Seattle"]["price"].describe()
Out[168]:
count    3166.000000
mean      126.844599
std        90.068497
min        22.000000
25%        75.000000
50%       100.000000
75%       150.000000
max      1000.000000
Name: price, dtype: float64
In [46]:
import plotly.figure_factory as ff

# Keep only listings priced below 1500 for the comparison.
df = listings[listings["price"] < 1500]

# One price series per city, in the same order as the labels.
group_labels = ['Boston', 'Seattle']
hist_data = [df.loc[df["city"] == label, "price"] for label in group_labels]

# Distribution plot with the histogram bars hidden.
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)
fig.update_layout(
    title_text='Listing Price Distribution',
    template='simple_white',
    xaxis_title="Price",
)
fig.show()

Is there any pricing difference for particular property types between these cities?¶

In [264]:
# Compare prices per city for a fixed list of six property types.
majority_type = ["House", "Apartment", "Condominium", "Townhouse", "Bed & Breakfast", "Loft"]
selected_types = listings["property_type"].isin(majority_type)

fig = px.box(
    listings[selected_types],
    x="price",
    y="property_type",
    color="city",
    orientation='h',
)
# Single layout call: titles, legend, and the x-axis limited to the 0-800 range.
fig.update_layout(
    title_text='Price by Type',
    template='simple_white',
    xaxis_title="Price",
    yaxis_title="Type",
    xaxis_range=[0, 800],
    showlegend=True,
)
fig.show()
In [100]:
import plotly.express as px
from plotly.subplots import make_subplots


def _property_type_summary(city):
    """Return listing count (`id`) and mean `price` per property type for one city, sorted by count."""
    return (
        listings.loc[listings["city"] == city, ["city", "property_type", "id", "price"]]
        .groupby(['city', 'property_type'])
        .agg({'id': 'size', 'price': 'mean'})
        .reset_index()
        .sort_values(by='id', ascending=True)
    )


def _property_type_bar(summary, title):
    """Build a horizontal bar chart of listing counts, coloured and labelled by mean price."""
    bar = px.bar(
        summary,
        y="property_type", x="id", color="price", text="price",
        orientation='h',
    )
    bar.update_layout(
        title_text=title, template='simple_white',
        yaxis={'categoryorder': 'total ascending'},
        yaxis_title="Property Type",
        xaxis_title="Number of Listing",
    )
    bar.update_traces(texttemplate='%{text:.2s}')
    return bar


df_seattle = _property_type_summary("Seattle")
fig_seattle = _property_type_bar(df_seattle, 'Seattle Property Type')

df_boston = _property_type_summary("Boston")
fig_boston = _property_type_bar(df_boston, 'Boston Property Type')

# Collect the traces of each per-city figure so they can be placed side by side.
figure1_traces = list(fig_seattle.data)
figure2_traces = list(fig_boston.data)

fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=('Seattle', 'Boston'))
# add_trace(row=, col=) is used because append_trace is deprecated in plotly.
for bar_trace in figure1_traces:
    fig_subplot.add_trace(bar_trace, row=1, col=1)
for bar_trace in figure2_traces:
    fig_subplot.add_trace(bar_trace, row=1, col=2)

fig_subplot.update_layout(
    title_text='Property Type (color indicate price)', template='simple_white',
    yaxis_title="Property Type",
    showlegend=True,
)
fig_subplot.update_traces(texttemplate='%{text:.2s}')

# Same x-axis title on both panels.
fig_subplot.update_xaxes(title_text='Number of Listing')

fig_subplot.show()

How does the average rating differ between the two cities, by property type?¶

In [55]:
import plotly.figure_factory as ff

# Keep listings that have an overall rating and whose rating is above 50.
overall_rating = listings["review_scores_rating"]
df = listings[overall_rating.notnull() & overall_rating.gt(50)]

# One rating series per city, in the same order as the labels.
group_labels = ['Boston', 'Seattle']
hist_data = [df.loc[df["city"] == label, "review_scores_rating"] for label in group_labels]

# Distribution plot with the histogram bars hidden.
fig = ff.create_distplot(hist_data, group_labels, show_hist=False)
fig.update_layout(
    title_text='Overall Rating Distribution',
    template='simple_white',
    xaxis_title="Review Score",
)
fig.show()
In [267]:
import plotly.express as px
from plotly.subplots import make_subplots


def _rating_by_type_summary(city):
    """Return mean rating, review total and listing count per property type for one city.

    Property types without any rating (mean is NaN) are dropped for every
    city, so both panels are filtered the same way.
    """
    summary = (
        listings.loc[
            listings["city"] == city,
            ["city", "property_type", "id", "review_scores_rating", "number_of_reviews"],
        ]
        .groupby(['city', 'property_type'])
        .agg({'id': 'size', 'review_scores_rating': 'mean', "number_of_reviews": "sum"})
        .reset_index()
        .sort_values(by='review_scores_rating', ascending=True)
    )
    return summary[summary["review_scores_rating"].notnull()]


def _rating_by_type_bar(summary, title):
    """Build a horizontal bar chart of mean rating per property type, coloured by review total."""
    bar = px.bar(
        summary,
        y="property_type", x="review_scores_rating",
        color="number_of_reviews", text="review_scores_rating",
        orientation='h',
    )
    bar.update_layout(
        title_text=title, template='simple_white',
        yaxis={'categoryorder': 'total ascending'},
        yaxis_title="Property Type",
        # The x values are mean ratings, not listing counts.
        xaxis_title="Average Rating",
    )
    bar.update_traces(texttemplate='%{text:.2s}')
    return bar


df_seattle = _rating_by_type_summary("Seattle")
fig_seattle = _rating_by_type_bar(df_seattle, 'Seattle Property Type')

df_boston = _rating_by_type_summary("Boston")
fig_boston = _rating_by_type_bar(df_boston, 'Boston Property Type')

# Collect the traces of each per-city figure so they can be placed side by side.
figure1_traces = list(fig_seattle.data)
figure2_traces = list(fig_boston.data)

fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=('Seattle', 'Boston'), horizontal_spacing=0.15)
# add_trace(row=, col=) is used because append_trace is deprecated in plotly.
for bar_trace in figure1_traces:
    fig_subplot.add_trace(bar_trace, row=1, col=1)
for bar_trace in figure2_traces:
    fig_subplot.add_trace(bar_trace, row=1, col=2)

fig_subplot.update_layout(
    title_text='Review by Type (color indicate number of review)', template='simple_white',
    yaxis_title="Property Type",
    showlegend=True,
)

# Same x-axis title on both panels.
fig_subplot.update_xaxes(title_text='Average Rating')

fig_subplot.show()

Is a higher score due to a lower price?¶

In [ ]:
 
In [261]:
# Rank the numeric listing columns by the absolute value of their correlation
# with the overall review score (Seattle only).
# Only numeric columns are passed to corr(): the frame also holds text columns,
# and pandas >= 2.0 raises on those instead of silently skipping them.
numeric_seattle = listings[listings["city"] == "Seattle"].select_dtypes(include="number")
correlation = numeric_seattle.corr().reset_index()
correlation["absolute_corr"] = np.abs(correlation["review_scores_rating"]).fillna(0)
correlation = correlation.sort_values(by=['absolute_corr'], ascending=False)[["index", "review_scores_rating"]].reset_index(drop=True)
correlation = correlation.rename(columns={"index": "factor"})
# NOTE(review): rows 0 and 1 of the ranking are skipped. Row 0 is presumably
# the score's correlation with itself; confirm that dropping row 1 too is intended.
correlation = correlation.loc[2:15, :].reset_index(drop=True)
correlation.index += 1  # number the displayed ranking from 1
correlation
Out[261]:
factor review_scores_rating
1 review_scores_cleanliness 0.642882
2 review_scores_accuracy 0.621257
3 review_scores_communication 0.540620
4 review_scores_checkin 0.521813
5 review_scores_location 0.368423
6 calculated_host_listings_count -0.219280
7 square_feet 0.143793
8 host_listings_count -0.109357
9 host_total_listings_count -0.109357
10 reviews_per_month 0.087313
11 price 0.055551
12 availability_30 -0.051439
13 monthly_price 0.048595
14 availability_60 -0.048460
In [260]:
# Rank the numeric listing columns by the absolute value of their correlation
# with the overall review score (Boston only).
# Only numeric columns are passed to corr(): the frame also holds text columns,
# and pandas >= 2.0 raises on those instead of silently skipping them.
numeric_boston = listings[listings["city"] == "Boston"].select_dtypes(include="number")
correlation = numeric_boston.corr().reset_index()
correlation["absolute_corr"] = np.abs(correlation["review_scores_rating"]).fillna(0)
correlation = correlation.sort_values(by=['absolute_corr'], ascending=False)[["index", "review_scores_rating"]].reset_index(drop=True)
correlation = correlation.rename(columns={"index": "factor"})
# NOTE(review): rows 0 and 1 of the ranking are skipped. Row 0 is presumably
# the score's correlation with itself; confirm that dropping row 1 too is intended.
correlation = correlation.loc[2:15, :].reset_index(drop=True)
correlation.index += 1  # number the displayed ranking from 1
correlation
Out[260]:
factor review_scores_rating
1 review_scores_cleanliness 0.754327
2 review_scores_accuracy 0.705104
3 review_scores_communication 0.600599
4 review_scores_checkin 0.584034
5 review_scores_location 0.458901
6 square_feet -0.175922
7 availability_30 -0.147676
8 calculated_host_listings_count -0.142220
9 availability_60 -0.138288
10 host_listings_count -0.125055
11 host_total_listings_count -0.125055
12 availability_90 -0.123666
13 weekly_price 0.109238
14 price 0.105651
In [183]:
# Price against overall rating, one colour per city; the y-axis is limited to 0-1000.
fig = px.scatter(
    listings,
    x="review_scores_rating",
    y="price",
    color="city",
)
fig.update_layout(yaxis_range=[0, 1000])
fig.show()

Is there a particular neighbourhood with a high rating?¶

In [164]:
import plotly.express as px
from plotly.subplots import make_subplots


def _neighbourhood_rating_summary(city, top=20):
    """Return the `top` neighbourhoods of one city by mean rating, sorted ascending.

    Neighbourhoods without any rating (mean is NaN) are dropped before the
    ranking is cut.
    """
    summary = (
        listings.loc[
            listings["city"] == city,
            ["city", "neighbourhood_cleansed", "review_scores_rating", "number_of_reviews"],
        ]
        .groupby(['city', 'neighbourhood_cleansed'])
        .agg({'review_scores_rating': 'mean', "number_of_reviews": "sum"})
        .reset_index()
    )
    return (
        summary[summary["review_scores_rating"].notnull()]
        .sort_values(by='review_scores_rating', ascending=True)
        .tail(top)
    )


def _neighbourhood_rating_bar(summary, title):
    """Build a horizontal bar chart of mean rating per neighbourhood, coloured by review total."""
    bar = px.bar(
        summary,
        y="neighbourhood_cleansed", x="review_scores_rating",
        color="number_of_reviews", text="review_scores_rating",
        orientation='h',
    )
    bar.update_layout(
        title_text=title, template='simple_white',
        yaxis={'categoryorder': 'total ascending'},
        # Axis labels describe what is plotted: neighbourhoods against mean rating.
        yaxis_title="Neighbourhood",
        xaxis_title="Average Rating",
    )
    bar.update_traces(texttemplate='%{text:.2s}')
    return bar


df_seattle = _neighbourhood_rating_summary("Seattle")
fig_seattle = _neighbourhood_rating_bar(df_seattle, 'Seattle Neighbourhood')

df_boston = _neighbourhood_rating_summary("Boston")
fig_boston = _neighbourhood_rating_bar(df_boston, 'Boston Neighbourhood')

# Collect the traces of each per-city figure so they can be placed side by side.
figure1_traces = list(fig_seattle.data)
figure2_traces = list(fig_boston.data)

fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=('Seattle', 'Boston'), horizontal_spacing=0.22)
# add_trace(row=, col=) is used because append_trace is deprecated in plotly.
for bar_trace in figure1_traces:
    fig_subplot.add_trace(bar_trace, row=1, col=1)
for bar_trace in figure2_traces:
    fig_subplot.add_trace(bar_trace, row=1, col=2)

fig_subplot.update_layout(
    title_text='Review by Neighbourhood (color indicate number of review)', template='simple_white',
    yaxis_title="Neighbourhood",
    showlegend=True,
)

# Same x-axis title on both panels.
fig_subplot.update_xaxes(title_text='Average Rating')

fig_subplot.show()

Superhost vs Non-Superhost¶

Price¶

In [333]:
majority_type = ["House", "Apartment", "Condominium", "Townhouse", "Bed & Breakfast", "Loft"]
# Fixed category order so both per-city figures assign colours to the two
# host groups in the same sequence.
superhost_order = {"host_is_superhost": ["Superhost", "Non-Superhost"]}


def _superhost_price_rows(city):
    """Return one city's listings with a known superhost flag and a price below 800."""
    return listings[
        (listings["host_is_superhost"].notnull()) &
        (listings["city"] == city) &
        (listings["price"] < 800)
    ]


def _superhost_price_box(rows):
    """Build a horizontal box plot of price per property type, split by superhost status."""
    box = px.box(
        rows[rows["property_type"].isin(majority_type)],
        y="property_type", x="price", color="host_is_superhost",
        orientation='h',
        category_orders=superhost_order,
    )
    box.update_layout(
        title_text='Price by Type', template='simple_white',
        yaxis_title="Type",
        xaxis_title="Price",
    )
    return box


df_seattle = _superhost_price_rows("Seattle")
fig_seattle = _superhost_price_box(df_seattle)

df_boston = _superhost_price_rows("Boston")
fig_boston = _superhost_price_box(df_boston)
fig_boston.update_layout(showlegend=False)
# The layout flag above is not carried over with the traces, so the Boston
# traces are hidden from the legend themselves; otherwise the combined figure
# would list each host group twice.
fig_boston.update_traces(showlegend=False)

# Collect the traces of each per-city figure so they can be placed side by side.
figure1_traces = list(fig_seattle.data)
figure2_traces = list(fig_boston.data)

fig_subplot = make_subplots(rows=1, cols=2, subplot_titles=('Seattle', 'Boston'), horizontal_spacing=0.15, vertical_spacing=1)
# add_trace(row=, col=) is used because append_trace is deprecated in plotly.
for box_trace in figure1_traces:
    fig_subplot.add_trace(box_trace, row=1, col=1)
for box_trace in figure2_traces:
    fig_subplot.add_trace(box_trace, row=1, col=2)

fig_subplot.update_layout(
    title_text='Price Superhost vs Non-Superhost', template='simple_white',
    yaxis_title="Type",
    boxmode='group',
    legend=dict(orientation="h"),
)

# Same x-axis title on both panels.
fig_subplot.update_xaxes(title_text='Price')

fig_subplot.show()

Rating¶

In [345]:
# Summary statistics of every review score for Seattle listings, split by
# superhost status.
df = listings[listings["city"] == "Seattle"]
score_columns = [
    'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
]
score_stats = df[['host_is_superhost'] + score_columns].groupby(['host_is_superhost']).describe()
# Reshape so each review score is a row and the statistics per host group are columns.
score_stats.T.unstack(1)
Out[345]:
host_is_superhost Non-Superhost Superhost
count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max
review_scores_rating 2420.0 93.652893 7.192689 20.0 91.0 95.0 100.0 100.0 751.0 97.395473 2.624384 60.0 96.0 98.0 99.0 100.0
review_scores_accuracy 2409.0 9.552511 0.760750 2.0 9.0 10.0 10.0 10.0 751.0 9.905459 0.314722 8.0 10.0 10.0 10.0 10.0
review_scores_cleanliness 2414.0 9.454018 0.864743 3.0 9.0 10.0 10.0 10.0 751.0 9.885486 0.365425 6.0 10.0 10.0 10.0 10.0
review_scores_checkin 2409.0 9.731424 0.663235 2.0 10.0 10.0 10.0 10.0 751.0 9.964048 0.200098 8.0 10.0 10.0 10.0 10.0
review_scores_communication 2416.0 9.758278 0.635341 2.0 10.0 10.0 10.0 10.0 751.0 9.974700 0.165406 8.0 10.0 10.0 10.0 10.0
review_scores_location 2412.0 9.564677 0.661469 4.0 9.0 10.0 10.0 10.0 751.0 9.750999 0.485025 6.0 10.0 10.0 10.0 10.0
review_scores_value 2411.0 9.359602 0.795533 2.0 9.0 9.0 10.0 10.0 751.0 9.749667 0.471787 6.0 10.0 10.0 10.0 10.0
In [348]:
# Initialise plotly's offline notebook mode for this notebook.
import plotly.offline
plotly.offline.init_notebook_mode()
In [ ]:
 
In [ ]: